package org.xcolab.util.html;

import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Entities;
import org.jsoup.safety.Cleaner;
import org.jsoup.safety.Whitelist;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Utility class to sanitize and format HTML inputs.
 */
public final class HtmlUtil {

    /** Matches complete anchors without nested tags, image tags, and bare opening anchor tags. */
    private static final Pattern EXISTING_LINKS_PATTERN =
            Pattern.compile("(<a[^>]*>[^<]*</a>|<img[^>]*>|<a[^>]*>)");

    /**
     * Matches plain-text URLs that start with {@code http://}, {@code https://} or {@code www.}.
     * The tail stops at whitespace or at {@code <} so that markup following a URL
     * (e.g. a closing {@code </p>}) is not swallowed into the link.
     */
    private static final Pattern URL_PATTERN =
            Pattern.compile("(http://|https://|www\\.)([{\\w-]*\\.)+\\w{1,4}([^\\s<]*)");

    /** Matches the start of an {@code href} attribute whose value begins with a slash. */
    private static final Pattern RELATIVE_HREF_PATTERN = Pattern.compile("(href=[\"'])/");

    private HtmlUtil() {
    }

    /**
     * Removes all html from the input string.
     *
     * @param text unsafe input
     * @return input string without any html tags
     */
    public static String cleanAll(String text) {
        return clean(text, Whitelist.none(), "");
    }

    /**
     * Removes unsafe and structural html from the input string.
     *
     * @param text unsafe input
     * @return input string without dangerous and structural html tags
     */
    public static String cleanMost(String text) {
        return clean(text, Whitelist.simpleText(), "");
    }

    /**
     * Removes unsafe html from the input string.
     *
     * <p>Starts from jsoup's "basic with images" whitelist and additionally allows
     * {@code style} on images and {@code name}/{@code class} on anchors. Relative links
     * are preserved.
     *
     * @param text unsafe input
     * @param baseUri used to evaluate relative links
     * @return input string without dangerous html tags
     */
    public static String cleanSome(String text, String baseUri) {
        final Whitelist whitelist = Whitelist.basicWithImages()
                .addAttributes("img", "style")
                .addAttributes("a", "name")
                .addAttributes("a", "class")
                .preserveRelativeLinks(true);
        return clean(text, whitelist, baseUri);
    }

    /**
     * Removes html from the input string, allowing only tags as indicated by the whitelist.
     *
     * @param text the unsafe input text
     * @param whitelist a list of allowed tags
     * @param baseUri used to evaluate relative links
     * @return input text without html tags other than those on the whitelist;
     *         the empty string if {@code text} is {@code null} or empty
     */
    public static String clean(String text, Whitelist whitelist, String baseUri) {
        if (StringUtils.isEmpty(text)) {
            return "";
        }
        final Document dirty = Jsoup.parse(text, baseUri);
        final Document cleaned = new Cleaner(whitelist).clean(dirty);
        // Adjust escape mode so that the output is not littered with named html entities,
        // http://stackoverflow.com/questions/8683018/jsoup-clean-without-adding-html-entities
        cleaned.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
        return cleaned.body().html();
    }

    /**
     * Unescapes the entities of stored text so that it can be used in emails.
     *
     * <p>All HTML 4 entities are unescaped first. Apostrophe entities that are still present
     * afterwards ({@code &#39;}, e.g. from doubly escaped input, and the XML-only
     * {@code &apos;}, which is not an HTML 4 entity) are then replaced by a plain
     * apostrophe.
     *
     * @param body the text to be cleaned, must not be {@code null}
     * @return unescaped text for emails
     */
    public static String decodeHTMLEntitiesForEmail(String body) {
        return org.apache.commons.lang3.StringEscapeUtils.unescapeHtml4(body)
                .replace("&#39;", "'")
                .replace("&apos;", "'");
    }

    /**
     * Prefixes every {@code href} value that starts with a slash with the given base URL.
     *
     * @param html the html whose links are rewritten, must not be {@code null}
     * @param baseUrl the prefix to put in front of relative links (without trailing slash);
     *         it is inserted literally, so {@code $} and {@code \} carry no special meaning
     * @return the html with root-relative hrefs turned into absolute ones
     */
    public static String makeRelativeLinksAbsolute(String html, String baseUrl) {
        // quoteReplacement keeps '$' and '\' in baseUrl from being read as group references.
        final String replacement = "$1" + Matcher.quoteReplacement(baseUrl + "/");
        return RELATIVE_HREF_PATTERN.matcher(html).replaceAll(replacement);
    }

    /**
     * Builds a {@code rel='nofollow'} anchor for the given url.
     *
     * <p>If the url does not start with {@code http://} or {@code https://}, {@code http://}
     * is prepended. Quote characters in the url are escaped as character references so that
     * they cannot terminate the href attribute. The description is inserted verbatim, so
     * callers must pass text that is already safe to embed as html.
     *
     * @param url the link target, must not be {@code null}
     * @param desc the html shown as link content
     * @return the html of the anchor element
     */
    public static String createLink(String url, String desc) {
        final String href = escapeAttributeQuotes(withDefaultScheme(url));
        return "<a rel='nofollow' href='" + href + "'>" + desc + "</a>";
    }

    /** Prepends {@code http://} unless the url already starts with an http(s) scheme. */
    private static String withDefaultScheme(String url) {
        if (url.startsWith("http://") || url.startsWith("https://")) {
            return url;
        }
        return "http://" + url;
    }

    /** Replaces single and double quotes by character references for use in an attribute. */
    private static String escapeAttributeQuotes(String value) {
        return value.replace("'", "&#39;").replace("\"", "&quot;");
    }

    /**
     * Inserts a {@code <br />} tag in front of every line feed.
     *
     * @param content the text to format, must not be {@code null}
     * @return the text with html line breaks added
     */
    public static String addHtmlLineBreaks(String content) {
        return content.replace("\n", " <br />\n");
    }

    /**
     * Formats user content for display: adds html line breaks (unless the content already
     * contains {@code <br} tags), turns plain-text URLs into links, and finally replaces
     * every double quote by a single quote.
     *
     * @param content the content to format, must not be {@code null}
     * @return the formatted content
     */
    public static String filterAndFormatContent(String content) {
        final String withLineBreaks = content.contains("<br") ? content : addHtmlLineBreaks(content);
        return linkifyUrls(withLineBreaks).replace('"', '\'');
    }

    /**
     * Wraps plain-text URLs in {@code rel='nofollow'} anchors.
     *
     * <p>URLs that start inside an existing anchor or image tag are left untouched.
     *
     * @param content the html content to process, must not be {@code null}
     * @return the content with plain-text URLs turned into links
     */
    public static String linkifyUrls(String content) {
        // Remember where existing anchors/images are so that their URLs are not linked again.
        final List<int[]> existingLinkSpans = new ArrayList<>();
        final Matcher existingLinksMatcher = EXISTING_LINKS_PATTERN.matcher(content);
        while (existingLinksMatcher.find()) {
            existingLinkSpans.add(
                    new int[] {existingLinksMatcher.start(), existingLinksMatcher.end()});
        }

        final Matcher urlMatcher = URL_PATTERN.matcher(content);
        final StringBuilder result = new StringBuilder();
        int lastIndex = 0;
        while (urlMatcher.find()) {
            if (startsInsideAny(existingLinkSpans, urlMatcher.start())) {
                // Skipped text is copied later, together with the text before the next link.
                continue;
            }
            result.append(content, lastIndex, urlMatcher.start());
            final String url = urlMatcher.group();
            result.append(createLink(url, url));
            lastIndex = urlMatcher.end();
        }
        result.append(content, lastIndex, content.length());
        return result.toString();
    }

    /** Returns whether the position lies strictly inside one of the {start, end} spans. */
    private static boolean startsInsideAny(List<int[]> spans, int position) {
        for (int[] span : spans) {
            if (position > span[0] && position < span[1]) {
                return true;
            }
        }
        return false;
    }

    /**
     * Replaces every anchor whose {@code rel} attribute is not exactly {@code nofollow} by a
     * fresh {@code rel="nofollow"} anchor with the same target and text.
     *
     * <p>The replacement carries only the {@code rel} and {@code href} attributes and plain
     * text content; an anchor without text shows its URL instead. The replacement is built
     * through the DOM rather than from an html string, so link text and href are treated as
     * data and are never re-parsed as markup.
     *
     * @param document the document to modify in place
     * @return the same document instance
     */
    public static Document addNoFollowToLinkTagsInDocument(Document document) {
        for (Element anchor : document.select("a")) {
            if (anchor.attr("rel").equals("nofollow")) {
                continue;
            }
            final String linkUrl = anchor.attr("href");
            final String linkText = anchor.text();

            // NOTE(review): as before, hrefs that do not start with http(s):// (relative
            // links, mailto:, ...) get "http://" prepended - confirm this is intended.
            final Element replacement = document.createElement("a")
                    .attr("rel", "nofollow")
                    .attr("href", withDefaultScheme(linkUrl))
                    .text(linkText.isEmpty() ? linkUrl : linkText);
            anchor.replaceWith(replacement);
        }
        return document;
    }
}